{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f7655ae8",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/tomasonjo/blogs/blob/master/llm/neo4jvector_langchain_deepdive.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ea7390e8-b3b6-449e-9819-9cbec935fbdf",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ea7390e8-b3b6-449e-9819-9cbec935fbdf",
    "outputId": "9d6be0d3-dc61-46b9-f440-95e9ea48af90"
   },
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the running kernel's environment;\n",
    "# -q suppresses the long dependency-resolution log that would otherwise\n",
    "# bloat the notebook output.\n",
    "%pip install -q langchain openai wikipedia tiktoken neo4j langchain_openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e7e57380-e2a7-4dce-94a9-76f291c49e78",
   "metadata": {
    "id": "e7e57380-e2a7-4dce-94a9-76f291c49e78"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "from langchain_community.vectorstores.neo4j_vector import Neo4jVector\n",
    "from langchain.document_loaders import WikipediaLoader\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.docstore.document import Document\n",
    "\n",
    "# Never hardcode API keys in a notebook (they leak via version control);\n",
    "# prompt for the key only if it is not already set in the environment.\n",
    "if \"OPENAI_API_KEY\" not in os.environ:\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "UFQeyPsbqViM",
   "metadata": {
    "id": "UFQeyPsbqViM"
   },
   "source": [
    "# Neo4j x LangChain: Deep dive into the new Vector index implementation\n",
    "## Learn how to customize LangChain's wrapper of Neo4j vector index\n",
    "\n",
    "Neo4j was and is an excellent fit for handling structured information, but it struggled a bit with semantic search due to its brute-force approach. However, the struggle is in the past as Neo4j has introduced a new vector index in version 5.11 designed to efficiently perform semantic search over unstructured text or other embedded data modalities. The newly added vector index makes Neo4j a great fit for most RAG applications as it now works great with both structured and unstructured data.\n",
    "\n",
    "![1_AH05dvGA_7db_EMySc9AAw.png]()\n",
    "\n",
    "This blog post is designed to walk you through all the customization options available in the Neo4j Vector Index implementation in LangChain.\n",
    "\n",
    "## Neo4j Environment setup\n",
    "You need to set up Neo4j 5.11 or greater to follow along with the examples in this blog post. The easiest way is to start a free instance on [Neo4j Aura](https://neo4j.com/cloud/platform/aura-graph-database/), which offers cloud instances of Neo4j database. Alternatively, you can also set up a local instance of the Neo4j database by downloading the Neo4j Desktop application and creating a local database instance.\n",
    "## Example dataset\n",
    "For the purpose of this blog post, we will use the WikipediaLoader to fetch text from the Witcher page."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bd210816-659c-4e99-80ed-ce17abd9e409",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bd210816-659c-4e99-80ed-ce17abd9e409",
    "outputId": "ef093292-93bb-4af3-e504-442ff2f9fe24"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/wikipedia/wikipedia.py:389: GuessedAtParserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system (\"lxml\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n",
      "\n",
      "The code that caused this warning is on line 389 of the file /Users/tomazbratanic/anaconda3/lib/python3.11/site-packages/wikipedia/wikipedia.py. To get rid of this warning, pass the additional argument 'features=\"lxml\"' to the BeautifulSoup constructor.\n",
      "\n",
      "  lis = BeautifulSoup(html).find_all('li')\n"
     ]
    }
   ],
   "source": [
    "# Read the wikipedia article\n",
    "raw_documents = WikipediaLoader(query=\"The Witcher\").load()\n",
    "# Define chunking strategy\n",
    "text_splitter = CharacterTextSplitter.from_tiktoken_encoder(\n",
    "    chunk_size=1000, chunk_overlap=20\n",
    ")\n",
    "# Chunk the document\n",
    "documents = text_splitter.split_documents(raw_documents)\n",
    "# Drop the bulky `summary` metadata key so it is not duplicated onto every\n",
    "# chunk node when the documents are imported into Neo4j below.\n",
    "for d in documents:\n",
    "    del d.metadata[\"summary\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "UKD-ATnfq7ZO",
   "metadata": {
    "id": "UKD-ATnfq7ZO"
   },
   "source": [
    "## Neo4j Vector index customization\n",
    "Each text chunk is stored in Neo4j as a single isolated node.\n",
    "\n",
    "![Screenshot from 2023-09-06 12-52-01.png]()\n",
    "\n",
    "By default, Neo4j vector index implementation in LangChain represents the documents using the Chunk node label, where the text property stores the text of the document, and the embedding property holds the vector representation of the text. The implementation allows you to customize the node label, text and embedding property names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "380059c2-4651-4773-a9e4-db5ad63cb06d",
   "metadata": {
    "id": "380059c2-4651-4773-a9e4-db5ad63cb06d"
   },
   "outputs": [],
   "source": [
    "from getpass import getpass\n",
    "\n",
    "# Read connection details from the environment instead of hardcoding\n",
    "# credentials in the notebook — committed credentials leak via version\n",
    "# control and shared notebooks. Falls back to an interactive prompt for\n",
    "# the password.\n",
    "url = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
    "username = os.environ.get(\"NEO4J_USERNAME\", \"neo4j\")\n",
    "password = os.environ.get(\"NEO4J_PASSWORD\") or getpass(\"Neo4j password: \")\n",
    "\n",
    "neo4j_db = Neo4jVector.from_documents(\n",
    "    documents,\n",
    "    OpenAIEmbeddings(),\n",
    "    url=url,\n",
    "    username=username,\n",
    "    password=password,\n",
    "    database=\"neo4j\",  # neo4j by default\n",
    "    index_name=\"wikipedia\",  # vector by default\n",
    "    node_label=\"WikipediaArticle\",  # Chunk by default\n",
    "    text_node_property=\"info\",  # text by default\n",
    "    embedding_node_property=\"vector\",  # embedding by default\n",
    "    create_id_index=True,  # True by default\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dlMa9J3-rGuN",
   "metadata": {
    "id": "dlMa9J3-rGuN"
   },
   "source": [
    "In this example, we have specified that we want to store text chunks under the WikipediaArticle node label, where the info property is used to store text, and the vector property holds the text embedding representation. If you run the above examples, you should see the following information in the database.\n",
    "\n",
    "![Screenshot from 2023-09-06 13-07-20.png]()\n",
    "\n",
    "As mentioned, we define the info property to contain the text information, while the vector property is used to store the embedding. Any other properties like the source and title are treated as document metadata.\n",
    "\n",
    "By default, we also create a unique node property constraint on the id property of the specified node label for faster imports. You can verify that the constraint has been created by using the following Cypher statement:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "60087496-a403-4b0f-885f-6ef00601fc46",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "60087496-a403-4b0f-885f-6ef00601fc46",
    "outputId": "3069f374-dbcf-4d35-976f-ab928c0e05e5"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'id': 6,\n",
       "  'name': 'constraint_e5da4d45',\n",
       "  'type': 'UNIQUENESS',\n",
       "  'entityType': 'NODE',\n",
       "  'labelsOrTypes': ['WikipediaArticle'],\n",
       "  'properties': ['id'],\n",
       "  'ownedIndex': 'constraint_e5da4d45',\n",
       "  'propertyType': None}]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Verify the unique node property constraint created during the import above\n",
    "neo4j_db.query(\"SHOW CONSTRAINTS\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "IStQIFXErXhA",
   "metadata": {
    "id": "IStQIFXErXhA"
   },
   "source": [
    "As you would expect, we also create a vector index that will allow us to perform fast ANN searches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "64963364-d0fc-4888-b10f-e707812cab22",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "64963364-d0fc-4888-b10f-e707812cab22",
    "outputId": "c3d52cfb-ba3c-44b0-dcf7-c920ca119eb2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'name': 'tasks',\n",
       "  'type': 'VECTOR',\n",
       "  'labelsOrTypes': ['Task'],\n",
       "  'properties': ['embedding'],\n",
       "  'options': {'indexProvider': 'vector-1.0',\n",
       "   'indexConfig': {'vector.dimensions': 1536,\n",
       "    'vector.similarity_function': 'cosine'}}},\n",
       " {'name': 'wikipedia',\n",
       "  'type': 'VECTOR',\n",
       "  'labelsOrTypes': ['WikipediaArticle'],\n",
       "  'properties': ['vector'],\n",
       "  'options': {'indexProvider': 'vector-1.0',\n",
       "   'indexConfig': {'vector.dimensions': 1536,\n",
       "    'vector.similarity_function': 'cosine'}}}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neo4j_db.query(\n",
    "    \"\"\"SHOW INDEXES\n",
    "       YIELD name, type, labelsOrTypes, properties, options\n",
    "       WHERE type = 'VECTOR'\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "KudH2t70rZte",
   "metadata": {
    "id": "KudH2t70rZte"
   },
   "source": [
    "The LangChain implementation created a vector index named wikipedia , which indexes the vector property of WikipediaArticle nodes. Additionally, the provided configuration informs us that the vector embedding dimension is 1536 and uses the cosine similarity function.\n",
    "## Loading additional documents\n",
    "You can use the add_documents method to load additional documents into an instantiated vector index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "30bae957-6e92-46fa-a67b-4e74f443d1f9",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "30bae957-6e92-46fa-a67b-4e74f443d1f9",
    "outputId": "6c6e0e1b-6443-48ec-9acb-3554c4d788a2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['langchain']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "neo4j_db.add_documents(\n",
    "    [\n",
    "        Document(\n",
    "            page_content=\"LangChain is the coolest library since the Library of Alexandria\",\n",
    "            metadata={\"author\": \"Tomaz\", \"confidence\": 1.0}\n",
    "        )\n",
    "    ],\n",
    "    ids=[\"langchain\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "k87HR9B2rcaX",
   "metadata": {
    "id": "k87HR9B2rcaX"
   },
   "source": [
    "LangChain allows you to provide document ids to the add_documents method, which can be used to sync information across different systems and makes it easier to update or delete relevant text chunks.\n",
    "\n",
    "## Loading existing index\n",
    "If you have an existing vector index in Neo4j with populated data, you can use the from_existing_index method to connect to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7e9c944e-f356-4839-a30e-2cbd4cc46e4c",
   "metadata": {
    "id": "7e9c944e-f356-4839-a30e-2cbd4cc46e4c"
   },
   "outputs": [],
   "source": [
    "existing_index = Neo4jVector.from_existing_index(\n",
    "    OpenAIEmbeddings(),\n",
    "    url=url,\n",
    "    username=username,\n",
    "    password=password,\n",
    "    index_name=\"wikipedia\",\n",
    "    text_node_property=\"info\",  # Need to define if it is not default\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "YEh2bwHArhzw",
   "metadata": {
    "id": "YEh2bwHArhzw"
   },
   "source": [
    "First, the from_existing_index method checks if the index with the provided name actually exists in the database. If it exists, it can retrieve the node label and embedding node property from the index configuration map, which means that you don't have to manually set those."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9610e8d4-8d42-4de1-b1b2-534bf3e2be78",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9610e8d4-8d42-4de1-b1b2-534bf3e2be78",
    "outputId": "747f9587-f6b6-4d6e-8dd8-6664caf1e0c0"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WikipediaArticle\n",
      "vector\n"
     ]
    }
   ],
   "source": [
    "print(existing_index.node_label)\n",
    "print(existing_index.embedding_node_property)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "UJCpTKwarkcC",
   "metadata": {
    "id": "UJCpTKwarkcC"
   },
   "source": [
    "However, the index information does not contain the text node property information. Therefore, if you use any property besides the default one (text), specify it using the text_node_property parameter.\n",
    "## Custom retrieval queries\n",
    "Since Neo4j is a native graph database, the vector index implementation in LangChain allows customization and enrichment of the returned information. However, this feature is intended for more advanced users as you are responsible for custom data loading as well as retrieval.\n",
    "The retrieval_query parameter allows you to collect, transform, or calculate any additional graph information you want to return from the similarity search. To better understand it, we can look at the actual implementation in the code.\n",
    "\n",
    "```\n",
    "read_query = (\n",
    "    \"CALL db.index.vector.queryNodes($index, $k, $embedding) \"\n",
    "    \"YIELD node, score \"\n",
    ") + retrieval_query\n",
    "```\n",
    "\n",
    "From the code, we can observe that the vector similarity search is hardcoded. However, we then have the option to add any intermediate steps and return additional information. The retrieval query must return the following three columns:\n",
    "* text (String): This is usually the textual data that is associated with the node that has been retrieved. This could be the main content of the node, a name, a description, or any other text-based information.\n",
    "* score (Float): This represents the similarity score between the query vector and the vector associated with the returned node. The score quantifies how similar the query is to the returned nodes, often on a scale from 0 to 1\n",
    "* metadata (Dictionary): This is a more flexible column that can contain additional information about the node or the search. It can be a dictionary (or map) that includes various attributes or properties that give more context to the returned node.\n",
    "\n",
    "We will add a relationship to a WikipediaArticle node to demonstrate this functionality."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "82adf310-3eea-4181-bfa1-6fd773b7c254",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "82adf310-3eea-4181-bfa1-6fd773b7c254",
    "outputId": "ecf5570b-ad69-4b2e-b898-cb2b552ef20c"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "existing_index.query(\n",
    "    \"\"\"MATCH (w:WikipediaArticle {id:'langchain'})\n",
    "       MERGE (w)<-[:EDITED_BY]-(:Person {name:\"Galileo\"})\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "M2_ZWf90rurR",
   "metadata": {
    "id": "M2_ZWf90rurR"
   },
   "source": [
    "We have added an EDITED_BY relationship to the WikipediaArticle node with the given id. Let's now test out a custom retrieval option."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "0ee14d6b-745a-44cb-95ae-64f855ff6c2b",
   "metadata": {
    "id": "0ee14d6b-745a-44cb-95ae-64f855ff6c2b"
   },
   "outputs": [],
   "source": [
    "retrieval_query = \"\"\"\n",
    "OPTIONAL MATCH (node)<-[:EDITED_BY]-(p)\n",
    "WITH node, score, collect(p) AS editors\n",
    "RETURN node.info AS text,\n",
    "       score,\n",
    "       node {.*, vector: Null, info: Null, editors: editors} AS metadata\n",
    "\"\"\"\n",
    "\n",
    "existing_index_return = Neo4jVector.from_existing_index(\n",
    "    OpenAIEmbeddings(),\n",
    "    url=url,\n",
    "    username=username,\n",
    "    password=password,\n",
    "    database=\"neo4j\",\n",
    "    index_name=\"wikipedia\",\n",
    "    text_node_property=\"info\",\n",
    "    retrieval_query=retrieval_query,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "l_mTCryOrwqM",
   "metadata": {
    "id": "l_mTCryOrwqM"
   },
   "source": [
    "I won't go too much into the specifics of Cypher. You can use many resources to learn the basic syntax and more like the Neo4j Graph Academy. To construct a valid retrieval query, you must know that the relevant node from the vector similarity search is available under the node reference variable, while the similarity metric value is available under the score reference.\n",
    "\n",
    "Let's try it out."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f6f70333-2148-441a-a300-dfec69981451",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "f6f70333-2148-441a-a300-dfec69981451",
    "outputId": "1073f0f7-7b9b-4bc8-adde-26f8d1a132bf"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content='LangChain is the coolest library since the Library of Alexandria', metadata={'id': 'langchain', 'author': 'Tomaz', 'editors': [{'name': 'Galileo'}], 'confidence': 1.0})]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "existing_index_return.similarity_search(\"What do you know about LangChain?\", k=1)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "include_colab_link": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
